Fix: After training the model, subtract the effect of the mean expression from the model score

Problems:

library(tidyverse)
library(knitr)
library(plotly) ; library(viridis) ; library(gridExtra) ; library(RColorBrewer)
library(Rtsne)
library(knitr)
library(ROCR)
library(expss)

SFARI_colour_hue = function(r) {
  pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}

Load data

# Regression data
load(file='./../Data/Gandal/logreg_model.RData')

# Mean Expression data
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame

# Dataset created with DynamicTreeMerged algorithm
clustering_selected = 'DynamicHybridMergedSmall'
print(paste0('Using clustering ', clustering_selected))
## [1] "Using clustering DynamicHybridMergedSmall"
original_dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'), row.names=1)

rm(dds, datGenes, datMeta)

Solution: Remove bias from scores with linear fit

The linear trend is gone, but there still remains some other non-linear trends

# 1. Fit line to prob ~ meanExpr
linear_fit = lm(prob ~ meanExpr, data=plot_data)
slope = linear_fit$coefficients[['meanExpr']]
print(paste0('Slope: ', slope))
## [1] "Slope: 0.0174544683661941"
# 2. Remove mean expression signal
remove_slope = function(df, slope){
  meanExpr = datExpr[rownames(df),] %>% rowMeans
  if(!all(rownames(meanExpr)==rownames(df))){
    print('Row names do not match!')
    stop()
  }
  unbiased_score = df$prob + slope*(mean(meanExpr) - meanExpr) # This distribution has the same mean as the original one
  return(unbiased_score)
}

negative_set$corrected_score = remove_slope(negative_set, slope)
test_set$corrected_score = remove_slope(test_set, slope)

plot_data = data.frame('ID'=rownames(datExpr), 'meanExpr'=rowMeans(datExpr)) %>% 
            right_join(negative_set %>% mutate(ID=rownames(negative_set)), by='ID')

plot_data %>% ggplot(aes(meanExpr, corrected_score)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.3) + ylab('Corrected Score') + xlab('Mean Expression') +
              geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) + 
              theme_minimal() + ggtitle('Mean expression vs model score corrected by linear fit by gene')

rm(slope, linear_fit, remove_slope)

Performance Metrics

Confusion Matrix

conf_mat = test_set %>% mutate(corrected_pred = corrected_score>0.5) %>%
                        apply_labels(SFARI = 'Actual Labels', 
                                     corrected_score = 'Corrected Score', 
                                     corrected_pred = 'Corrected Label Prediction')

cro(conf_mat$SFARI, list(conf_mat$pred, total()))
 FALSE   TRUE   #Total 
 Actual Labels 
   FALSE  101 64 165
   TRUE  64 101 165
   #Total cases  165 165 330
rm(conf_mat)

Accuracy

Accuracy went up? It doesn’t make sense, maybe it’s because it’s a small sample

acc = mean(test_set$SFARI==(test_set$corrected_score>0.5))

print(paste0('Accuracy = ', round(acc,4)))
## [1] "Accuracy = 0.6152"
rm(acc)

ROC Curve

AUC decreased 0.01

pred_ROCR = prediction(test_set$corrected_score, test_set$SFARI)

roc_ROCR = performance(pred_ROCR, measure='tpr', x.measure='fpr')
AUC = performance(pred_ROCR, measure='auc')@y.values[[1]]

plot(roc_ROCR, main=paste0('ROC curve (AUC=',round(AUC,2),')'), col='#009999')
abline(a=0, b=1, col='#666666')

rm(roc_ROCR, AUC)

Lift Curve

lift_ROCR = performance(pred_ROCR, measure='lift', x.measure='rpp')
plot(lift_ROCR, main='Lift curve', col='#86b300')

rm(lift_ROCR, pred_ROCR)

Analyse Model

Looks very similar to before

plot_data = test_set %>% dplyr::select(corrected_score, SFARI)

plot_data %>% ggplot(aes(corrected_score, fill=SFARI, color=SFARI)) + geom_density(alpha=0.3) + xlab('Score') +
              theme_minimal() + ggtitle('Model score distribution by SFARI Label')

The positive relation between SFARI scores and Model scores is still there

plot_data = test_set %>% mutate(ID=rownames(test_set)) %>% dplyr::select(ID, corrected_score) %>%
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID') %>%
            dplyr::select(ID, corrected_score, gene.score) %>% apply_labels(gene.score='SFARI Gene score')

cro(plot_data$gene.score)
 #Total 
 SFARI Gene score 
   1  8
   2  11
   3  34
   4  76
   5  32
   6  4
   None  165
   #Total cases  330
mean_vals = plot_data %>% group_by(gene.score) %>% summarise(mean_corrected_score = mean(corrected_score))

plot_data %>% ggplot(aes(corrected_score, color=gene.score, fill=gene.score)) + geom_density(alpha=0.25) + 
              geom_vline(data=mean_vals, aes(xintercept=mean_corrected_score, color=gene.score), linetype='dashed') +
              scale_colour_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) +
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + 
              ggtitle('Distribution of the Model scores by SFARI score') +
              xlab('Corrected Score') + ylab('Density') + theme_minimal()

ggplotly(plot_data %>% ggplot(aes(gene.score, corrected_score, fill=gene.score)) + geom_boxplot() + 
              scale_fill_manual(values=SFARI_colour_hue(r=c(1:6,8,7))) + 
              ggtitle('Distribution of the Model scores by SFARI score') +
              xlab('SFARI score') + ylab('Model score') + theme_minimal())
rm(mean_vals)

Print genes with highest corrected scores in test set

  • First gene no longer has score of 1, so it probably was there because of its high level of expression

  • Not a single gene with score 6

  • Very few genes without a SFARI score

test_set %>% dplyr::select(corrected_score, SFARI) %>% mutate(ID = rownames(test_set)) %>% 
             arrange(desc(corrected_score)) %>% top_n(50, wt=corrected_score) %>%
             left_join(original_dataset %>% mutate(ID=rownames(original_dataset)), by='ID')  %>% 
             dplyr::select(ID, corrected_score, SFARI, gene.score, MTcor, matches(clustering_selected)) %>%
             kable
ID corrected_score SFARI gene.score MTcor DynamicHybridMergedSmall
ENSG00000152217 0.8427161 TRUE 3 0.2020738 #00B70C
ENSG00000139767 0.8346415 TRUE 5 -0.5362834 #00C0BB
ENSG00000127124 0.8218297 TRUE 3 0.2020738 #00B70C
ENSG00000162105 0.8193478 TRUE 2 -0.5362834 #00C0BB
ENSG00000118058 0.8177748 TRUE 1 0.2020738 #00B70C
ENSG00000147010 0.8147129 TRUE 5 0.2020738 #00B70C
ENSG00000119866 0.8018502 TRUE 2 -0.5362834 #00C0BB
ENSG00000049618 0.7997779 TRUE 1 0.2020738 #00B70C
ENSG00000183117 0.7986013 TRUE 4 -0.9216750 #C17FFF
ENSG00000125851 0.7924033 FALSE None -0.6557529 #53B400
ENSG00000128815 0.7896276 TRUE 3 0.6236224 #00BF7D
ENSG00000169057 0.7872776 TRUE 2 -0.6714074 #8195FF
ENSG00000133216 0.7850113 TRUE 4 0.4840783 #00B6EB
ENSG00000170579 0.7754854 TRUE 3 -0.9216750 #C17FFF
ENSG00000148737 0.7656936 TRUE 3 -0.5362834 #00C0BB
ENSG00000116539 0.7642164 TRUE 1 0.2020738 #00B70C
ENSG00000197892 0.7563702 TRUE 4 0.2020738 #00B70C
ENSG00000185345 0.7550436 TRUE 3 -0.9216750 #C17FFF
ENSG00000176697 0.7491887 TRUE 5 0.4840783 #00B6EB
ENSG00000082701 0.7485968 TRUE 5 -0.5362834 #00C0BB
ENSG00000163635 0.7483507 TRUE 5 0.4727632 #F365E6
ENSG00000197694 0.7457768 FALSE None -0.5362834 #00C0BB
ENSG00000133958 0.7456077 TRUE 3 0.2020738 #00B70C
ENSG00000170043 0.7433856 FALSE None -0.5362834 #00C0BB
ENSG00000126705 0.7430874 TRUE 3 -0.5362834 #00C0BB
ENSG00000089012 0.7397676 FALSE None 0.4840783 #00B6EB
ENSG00000121297 0.7393241 TRUE 4 0.2020738 #00B70C
ENSG00000105409 0.7326351 TRUE 4 -0.5362834 #00C0BB
ENSG00000071242 0.7317059 TRUE 4 -0.5362834 #00C0BB
ENSG00000130477 0.7261053 TRUE 4 -0.9216750 #C17FFF
ENSG00000074054 0.7260603 TRUE 3 -0.4966667 #00C1A8
ENSG00000157014 0.7223243 FALSE None -0.5362834 #00C0BB
ENSG00000172292 0.7217194 FALSE None -0.9216750 #C17FFF
ENSG00000112640 0.7145476 TRUE 4 -0.6557529 #53B400
ENSG00000198752 0.7076024 TRUE 3 -0.5362834 #00C0BB
ENSG00000173809 0.7071743 FALSE None 0.3841921 #E88523
ENSG00000170485 0.7009354 TRUE 4 -0.5362834 #00C0BB
ENSG00000132821 0.6990407 FALSE None -0.6714074 #8195FF
ENSG00000156110 0.6976490 TRUE 4 -0.6714074 #8195FF
ENSG00000101842 0.6967189 FALSE None -0.8639848 #C49A00
ENSG00000139910 0.6955391 FALSE None -0.9216750 #C17FFF
ENSG00000136925 0.6954479 FALSE None 0.4840783 #00B6EB
ENSG00000117009 0.6933030 TRUE 5 0.4727632 #F365E6
ENSG00000107105 0.6913609 TRUE 4 -0.9216750 #C17FFF
ENSG00000136531 0.6906225 TRUE 1 -0.9216750 #C17FFF
ENSG00000135365 0.6869061 TRUE 4 0.2020738 #00B70C
ENSG00000187555 0.6831712 TRUE 2 -0.6557529 #53B400
ENSG00000137075 0.6816073 TRUE 4 -0.5362834 #00C0BB
ENSG00000100354 0.6782077 TRUE 2 0.4843988 #8EAB00
ENSG00000083544 0.6765330 FALSE None 0.2020738 #00B70C



Negative samples distribution

The top scores were affected the most (decreasing)

negative_set %>% ggplot(aes(prob, corrected_score)) + geom_point(alpha=0.2, color='#0099cc') + 
                 geom_abline(slope=1, intercept=0, color='#e6e6e6', linetype='dashed') + 
                 geom_smooth(color='#666666', alpha=0.5, se=FALSE, size=0.5) + coord_fixed() +
                 xlab('Original score') + ylab('Corrected score') + theme_minimal()

More genes are predicted as true as before (4975 before)

negative_set_table = negative_set %>% mutate(corrected_pred = corrected_score>0.5) %>%
                     apply_labels(corrected_score = 'Corrected Probability', 
                                  corrected_pred = 'Corrected Label Prediction')

cro(negative_set_table$corrected_pred)
 #Total 
 Corrected Label Prediction 
   FALSE  8966
   TRUE  5073
   #Total cases  14039
cat(paste0('\n', sum(negative_set$corrected_pred), ' genes are predicted as ASD-related'))
## 
## 0 genes are predicted as ASD-related

Probability and Gene Significance

The relation is not as strong as before in the highest scores, but it became stronger in the lowest scores, probably because they have a higher level of expression than the average gene, so their scores may have been increased.

This could mean that a linear fit is not the right way to fix this

negative_set %>% ggplot(aes(corrected_score, GS, color=MTcor)) + geom_point() + geom_smooth(method='loess', color='#666666') +
                 geom_line(stat='smooth', method='loess', color='#666666', alpha=0.5, size=1.2, aes(x=prob)) +
                 geom_hline(yintercept=mean(negative_set$GS), color='gray', linetype='dashed') +
                 scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) + 
                 ggtitle('Relation between the Model\'s Corrected Score and Gene Significance') + theme_minimal()

Not as high as before in the highest scores, a bit higher on the lowest scores

The transparent verison of the trend line is the original trend line

negative_set %>% ggplot(aes(corrected_score, abs(GS), color=MTcor)) + geom_point() + 
                 geom_hline(yintercept=mean(negative_set$absGS), color='gray', linetype='dashed') + 
                 geom_smooth(method='loess', color='#666666') +
                 geom_line(stat='smooth', method='loess', color='#666666', alpha=0.4, size=1.2, aes(x=prob)) +
                 scale_color_gradientn(colours=c('#F8766D','white','#00BFC4')) + 
                 ggtitle('Relation between the Model\'s Corrected Score and Gene Significance') + theme_minimal()

Summarised version of score vs mean expression, plotting by module instead of by gene

The difference in the trend lines between this plot and the one above is that the one above takes all the points into consideration while this considers each module as an observation by itself, so the top one is strongly affected by big modules and the bottom one treats all modules the same

The transparent version of each point and line are the original values and trends before the bias correction

plot_data = negative_set %>% group_by(MTcor) %>% summarise(mean = mean(prob), sd = sd(prob),
              new_mean = mean(corrected_score), new_sd = sd(corrected_score), n = n()) %>%
            mutate(MTcor_sign = ifelse(MTcor>0, 'Positive', 'Negative')) %>% left_join(original_dataset, by='MTcor') %>%
            dplyr::select(matches(clustering_selected), MTcor, MTcor_sign, mean, new_mean, sd, new_sd, n) %>% distinct()
colnames(plot_data)[1] = 'ID'

ggplotly(plot_data %>% ggplot(aes(MTcor, new_mean, size=n, color=MTcor_sign)) + geom_point(aes(id=ID)) + 
         geom_smooth(method='loess', color='gray', se=FALSE) + geom_smooth(method='lm', se=FALSE) + 
         geom_point(aes(y=mean), alpha=0.3) + theme(legend.position='none') +
         geom_line(stat='smooth', method='loess', color='gray', se=FALSE, alpha=0.3, size=1.2, aes(y=mean)) + 
         geom_line(stat='smooth', method='lm', se=FALSE, alpha=0.3, size=1.2, aes(y=mean)) + 
         xlab('Module-Diagnosis correlation') + ylab('Mean Corrected Score by the Model') + theme_minimal())

Probability and level of expression

Check if correcting by gene also corrected by module

Yes, but just a bit

mean_and_sd = data.frame(ID=rownames(datExpr), meanExpr=rowMeans(datExpr), sdExpr=apply(datExpr,1,sd))

plot_data = negative_set %>% mutate(ID=rownames(negative_set)) %>% left_join(mean_and_sd, by='ID') %>% 
            left_join(original_dataset %>% mutate(ID=rownames(original_dataset)) %>% 
                      dplyr::select(ID, matches(clustering_selected)), by='ID')
colnames(plot_data)[ncol(plot_data)] = 'Module'

plot_data2 = plot_data %>% group_by(Module) %>% summarise(meanExpr = mean(meanExpr), meanProb = mean(prob), 
                                                          new_meanProb = mean(corrected_score), n=n())

ggplotly(plot_data2 %>% ggplot(aes(meanExpr, new_meanProb, size=n)) + 
         geom_point(color=plot_data2$Module) + geom_point(color=plot_data2$Module, alpha=0.3, aes(y=meanProb)) + 
         geom_smooth(method='loess', se=TRUE, color='gray', alpha=0.1, size=0.7) + 
         geom_line(stat='smooth', method='loess', se=TRUE, color='gray', alpha=0.4, size=1.2, aes(y=meanProb)) +
         theme_minimal() + theme(legend.position='none') + 
         ggtitle('Mean expression vs corrected Model score by Module'))
rm(plot_data2, mean_and_sd)

Probability and SD of level of expression

The relation between SD and score became bigger than before (???)

plot_data %>% filter(sdExpr<0.5) %>% ggplot(aes(sdExpr, corrected_score)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) +
              geom_line(stat='smooth', method='lm', color='#999999', se=FALSE, alpha=0.4, size=1.5, aes(y=prob)) + 
              theme_minimal() + ggtitle('SD vs model probability by gene')

This approximation curve looks like the opposite of the trend found between mean/sd and model scores

plot_data %>% ggplot(aes(meanExpr, sdExpr)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.3) + 
              geom_smooth(method='lm', color='#999999', se=FALSE, alpha=1) + 
              scale_x_log10() + scale_y_log10() +
              theme_minimal() + ggtitle('Mean expression vs SD by gene')

Probability and lfc

This relation improved a lot, so the model seems to be getting better

plot_data = negative_set %>% mutate(ID=rownames(negative_set)) %>% 
            left_join(DE_info %>% mutate(ID=rownames(DE_info)), by='ID')

plot_data %>% filter(abs(log2FoldChange)<10) %>%
              ggplot(aes(log2FoldChange, corrected_score)) + geom_point(alpha=0.1, color='#0099cc') + 
              geom_smooth(method='loess', color='gray', alpha=0.3) +
              geom_line(stat='smooth', method='loess', color='gray', alpha=0.3, size=1.5, aes(y=prob)) +
              theme_minimal() + ggtitle('lfc vs model probability by gene')



Conclusion

Conclusion: This bias correction seems to be working up to a certain level, but there are still some non-linear patterns that remain and the relation with the SD that seems to have increased